# Extract all foreign language Manifestos with Scores

library(SnowballC)
library(manifestoR)
library(tidyverse)
library(tidytext)

# Downloading from the Manifesto Project requires a personal API key;
# one can be requested at https://manifesto-project.wzb.eu/
# The key is looked up in the text file named below.

key <- mp_setapikey(key.file = "manifesto_apikey.txt")

# Countries whose manifestos go into the corpus
countries <- c(
  "Australia", "Austria", "Belgium", "Canada",
  "France", "Germany", "Ireland", "Italy",
  "Netherlands", "New Zealand", "Norway", "Portugal",
  "Spain", "Switzerland", "United Kingdom", "United States"
)

# Restrict the main dataset to those countries and fetch the matching
# manifesto documents. The filter stays nested inside mp_corpus() on purpose,
# mirroring how the selection was originally passed to it.
corpus <- mp_corpus(filter(mp_maindataset(), countryname %in% countries))

#save(corpus, file = "corpus.RData")

#load("corpus.RData")

# Turn the corpus object into a tidy table
corpus <- tidy(corpus)

# Keep the columns used downstream (chosen by position), drop Catalan and
# Galician documents, then strip punctuation and lowercase the text
corpus <- corpus %>%
  select(1:5, 15:17) %>%
  filter(!(language %in% c("catalan", "galician"))) %>%
  mutate(text = tolower(removePunctuation(text)))

# Languages left in the corpus, in order of first appearance
languages <- corpus %>%
  distinct(language) %>%
  pull(language)

# This takes a while to run; it cleans large chunks of text. For each
# language the loop replaces newlines with spaces, removes that language's
# stopwords, squishes whitespace, writes the result to
# corpus/corpus_<language>.csv and prints a message once the file is done.

# write_csv() does not create missing directories, so make sure the output
# folder exists before the first file is written
dir.create("corpus", showWarnings = FALSE)

for (lang in languages) {
  corpus %>%
    filter(language == lang) %>%
    mutate(
      clean_text = text %>%
        str_replace_all("\\n", " ") %>%
        removeWords(words = stopwords(lang)) %>%
        # str_squish() collapses every run of whitespace into a single space,
        # so no further newline handling is needed after it
        str_squish()
    ) %>%
    write_csv(file = str_glue("corpus/corpus_{lang}.csv"))

  cat("\n done", lang)
}

# Show which files in the corpus folder have "LIWC" in their name
list.files("corpus", pattern = "LIWC")

# The in-memory corpus is not used again below
rm("corpus")

#############
# At this point, we ran all the cleaned files (each with all manifestos
# in a given language) through the LIWC dictionary program.
# This also took a while and we had to buy a license, but we can provide our
# serial code if you want to rerun independently
#############

# We then load the new files and proceed with the construction of the dataset

# Read the LIWC output file of every language and stack the results into one
# table. Only the columns at positions 1-6 and 11-13 are kept, and they are
# renamed to the descriptive names below.
# NOTE(review): n_max = 100 caps each file at its first 100 rows -- confirm
# this is intentional and not a leftover from testing.
df <- map_dfr(languages, function(lang) {
  liwc_file <- str_glue(
    "corpus/LIWC-22 Results - corpus_{lang} - LIWC Analysis.csv"
  )
  liwc_cols <- c(
    "manifesto_id", "party", "date", "language", "source", "title",
    "word_count", "vague", "concrete"
  )

  liwc_file %>%
    read_csv(n_max = 100, col_select = c(1:6, 11:13)) %>%
    setNames(liwc_cols)
})

# Secondary data (Seki-Williams governments file), also used by the authors
# of the paper we're replicating

sw <- haven::read_dta("Seki-Williams Governments--Version 2.0.dta")

# Merge all columns whose names start with "mpppy" into a single `parties`
# column (missing values skipped), then split it back so that each row holds
# a character vector of the values that were present
sw <- sw %>%
  unite(parties, starts_with("mpppy"), sep = ", ", na.rm = TRUE) %>%
  mutate(parties = str_split(parties, ", "))

# Lookup table of distinct marpor / country / ccode combinations,
# leaving out the "France IV" entries
codes <- sw %>%
  select(marpor, country, ccode) %>%
  distinct() %>%
  filter(country != "France IV")

# Derived variables:
#   net_concreteness - concrete score minus vague score
#   year             - first four characters of `date` (still a string here)
#   marpor           - first two characters of `party`, as a number
df <- mutate(
  df,
  net_concreteness = concrete - vague,
  year = str_sub(date, 1, 4),
  marpor = as.numeric(str_sub(party, 1, 2))
)

# Attach country information via marpor, drop Austria (rows whose country is
# NA after the join are dropped by this filter as well) and sort
df <- df %>%
  left_join(codes, by = "marpor") %>%
  filter(country != "Austria") %>%
  arrange(country, date)

# Build the government lookup that is joined onto the manifesto data:
#   date - neyear followed by the two-digit nemonth, as a number
#   gov  - start date of the government as a number (year, month, day)
# For every country/date combination only the row(s) with the largest `gov`
# value are kept.
sw2 <- sw %>%
  filter(marpor %in% df$marpor) %>%
  arrange(country, startyear, startmonth, startday) %>%
  mutate(
    date = as.numeric(paste0(neyear, sprintf("%0.2d", nemonth))),
    gov = as.numeric(paste0(
      startyear,
      sprintf("%0.2d", startmonth),
      sprintf("%0.2d", startday)
    ))
  ) %>%
  # rows where `date` could not be built (it came out as NA) are dropped
  filter(!is.na(date)) %>%
  group_by(country, date) %>%
  filter(gov == max(gov)) %>%
  # drop the grouping again so sw2 is a plain table for everything that
  # uses it afterwards
  ungroup() %>%
  select(country, marpor, date, gov, mpp_pm, parties)

# Attach the government information to each manifesto and flag, per row,
# whether the party equals mpp_pm (`pm`) and whether it appears in the
# row's `parties` vector (`partner`).
x <- df %>%
  left_join(sw2, by = c("marpor", "date")) %>%
  # keep matched rows only; comparing the list-column with "NULL" removes
  # entries that are NULL (a NULL element is compared as the string "NULL")
  filter(!is.na(mpp_pm), parties != "NULL") %>%
  mutate(
    pm = party == mpp_pm,
    partner = map2_lgl(party, parties, ~ .x %in% .y),
    # harmonise country labels
    country = case_when(
      country.x == "France V" ~ "France",
      country.x == "Great Britain" ~ "United Kingdom",
      TRUE ~ country.x
    ),
    year = as.numeric(year)
  )

# Country-level covariates, restricted to the countries present in x
cpd <- haven::read_dta("CPDS_1960-2020_Update_2022.dta") %>%
  filter(country %in% x$country)

# Within each country:
#   lag_gdp    - previous value of realgdpgr, ordered by year
#   prop_dummy - 0 when prop is 0 or 1, otherwise 1
# NOTE(review): rows where prop is NA also get prop_dummy = 1, because
# NA %in% c(0, 1) is FALSE -- confirm this is intended.
cpd <- cpd %>%
  group_by(country) %>%
  mutate(
    lag_gdp = lag(realgdpgr, order_by = year),
    prop_dummy = if_else(prop %in% c(0, 1), 0, 1)
  ) %>%
  # the grouping is only needed for the lag; remove it before the join
  ungroup() %>%
  select(country, year, prop, prop_dummy, lag_gdp)

# Add the country-year covariates and classify each party:
#   new_inc    - "Prime Minister", "Partner" or "Opposition"
#   government - 1 when the party is flagged as pm or partner, 0 otherwise
# The logical flags are tested directly instead of being compared with T / F,
# which are ordinary variables that can be reassigned.
x <- x %>%
  left_join(cpd, by = c("country", "year")) %>%
  mutate(
    new_inc = case_when(
      pm ~ "Prime Minister",
      partner & !pm ~ "Partner",
      !partner & !pm ~ "Opposition"
    ),
    government = as.numeric(pm | partner)
  )

# Store the finished dataset under the name `df`
df <- x

save(df, file = "extension.RData")
